
# Figure_5.R
# Aim: to reproduce Figure 5 of Atsusaka and Holbrook (2026)

library(tidyverse)

# read data --------------------------------------------------------------------

# Candidate-level archive and election-level metadata.
raw <- read_csv("archive-candidate.csv")
meta <- read_csv("archive-election.csv")

# Show every metadata row whose election_id occurs more than once, so that
# duplicated join keys can be inspected before the join below.
dup <- meta %>%
  group_by(election_id) %>%
  filter(n() > 1) %>%
  arrange(election_id)

print(dup, n = Inf)

# Attach the metadata first (the join uses election_id exactly as it appears
# in the two files), then harmonise the labels embedded in election_id.
# The substitutions are applied one after another in the order listed, so a
# later rule can match text produced by an earlier one.
raw <- raw %>%
  left_join(meta, by = "election_id") %>%
  mutate(
    election_id = str_replace_all(
      election_id,
      c(
        "ParkandRecreationCommissioner" = "ParkBoard",
        "CouncilMember" = "Council",
        "X_Minneapolis_At_Large_ParkBoard" = "G_Minneapolis_At_Large_ParkBoard",
        "BoardofEstimateandTaxation" = "Board_Of_Estimate_Taxation",
        "Board_Of_Estimate_TaxationMember" = "Board_Of_Estimate_Taxation"
      )
    )
  )

## Validation based on Minneapolis data ----------------------------------------
# Archive rows whose election_id contains "MN", restricted to round 1.
MN_archive_first <- raw %>%
  filter(grepl("MN", election_id) & round == 1)

# Scraped Minneapolis results, reshaped so that election_id follows the
# archive's naming scheme.
# Only first-choice votes are reported for single-round contests
MN_truth <- read_csv("minneapolis_scraped_election_results.csv") %>%
  mutate(
    state = "MN",
    # Labels containing "Ward" or "District" are reduced to their digits;
    # every other label (e.g. "At_Large") is left untouched.
    district = ifelse(grepl("Ward|District", district),
                      gsub("\\D", "", district),
                      district),
    # Single-digit districts are zero-padded to two characters ("3" -> "03").
    district = ifelse(str_detect(district, "^[0-9]$"),
                      str_pad(district, 2, pad = "0"),
                      district),
    office = case_when(
      office %in% c("City_Council", "CouncilMember") ~ "Council",
      office == "Park_Board" ~ "ParkBoard",
      TRUE ~ office
    ),
    election_id = paste("MN", year, "G", "Minneapolis", district, office,
                        sep = "_")
  ) %>%
  select(election_id, year, state, office, district,
         name = candidate,
         first_choice_votes,
         final_round_votes) %>%
  # Sort so that, when a candidate appears more than once in an election,
  # the row kept by distinct() is the first one in this ordering.
  arrange(election_id, name, first_choice_votes, final_round_votes) %>%
  distinct(election_id, name, .keep_all = TRUE)


# Sorted, de-duplicated election ids from each source.
vec <- MN_archive_first$election_id %>% unique() %>% sort()
vec2 <- MN_truth$election_id %>% unique() %>% sort()

# Ids found only in the scraped data, then ids found only in the archive.
setdiff(vec2, vec)
setdiff(vec, vec2)

# Archive rows whose election also appears in the scraped data.
out_archive <- MN_archive_first %>%
  filter(election_id %in% vec2)

# Number of distinct archive elections covered by the scraped data.
out_archive$election_id %>% unique() %>% length()



# Quantify ---------------------------------------------------------------------

# Compare archive vote counts with the scraped counts, candidate by candidate.
#
# Arguments:
#   archive   Archive rows for one round; must contain election_id, name,
#             votes, round and n_rounds.
#   truth     Scraped results; must contain election_id, name and the column
#             named by `truth_col`.
#   truth_col Name (string) of the scraped vote column to compare against.
#
# Returns one row per archive (election_id, name) pair with the columns
# election_id, name, votes, round, n_rounds, <truth_col> and
# diff = votes - <truth_col>. Pairs without a scraped counterpart get NA in
# <truth_col> and diff. Rows are ordered by sorted election_id and, within an
# election, by ascending votes.
validate_round_votes <- function(archive, truth, truth_col) {
  election_order <- sort(unique(archive$election_id))

  # Rows whose name contains "Tabulation" are not candidates to compare.
  truth_votes <- truth %>%
    filter(!grepl("Tabulation", name)) %>%
    select(election_id, name, all_of(truth_col))

  archive %>%
    # Sorting by votes before distinct() means that, for a repeated
    # (election_id, name) pair, the row with the fewest votes is kept.
    arrange(match(election_id, election_order), votes) %>%
    distinct(election_id, name, .keep_all = TRUE) %>%
    select(election_id, name, votes, round, n_rounds) %>%
    left_join(truth_votes, by = c("election_id", "name")) %>%
    mutate(diff = votes - .data[[truth_col]])
}


# Quantify: first round --------------------------------------------------------
MN_validate_first <- validate_round_votes(
  MN_archive_first, MN_truth, "first_choice_votes"
)

summary(MN_validate_first$diff)

# Keep only the pairs for which both counts are available.
MN_validate_first <- MN_validate_first %>%
  filter(!is.na(diff))

table(MN_validate_first$diff, useNA = "always")

# Candidates whose archive count disagrees with the scraped count.
MN_validate_first %>%
  filter(diff != 0)


# Quantify: final round -------------------------------------------------------
MN_archive_final <- raw %>%
  filter(grepl("MN", election_id),
         round == n_rounds)

# Election ids per source and the overlapping archive rows, kept for
# interactive inspection.
vec <- sort(unique(MN_archive_final$election_id))
vec2 <- sort(unique(MN_truth$election_id))

out_archive <- MN_archive_final %>%
  filter(election_id %in% vec2)

MN_validate_final <- validate_round_votes(
  MN_archive_final, MN_truth, "final_round_votes"
)

summary(MN_validate_final$diff)

# Keep only the pairs for which both counts are available.
MN_validate_final <- MN_validate_final %>%
  filter(!is.na(diff))

table(MN_validate_final$diff, useNA = "always")

# Candidates whose archive count disagrees with the scraped count.
MN_validate_final %>%
  filter(diff != 0)



# Visualize --------------------------------------------------------------------

# Give both validation tables the same layout: `round` becomes the facet
# label and the scraped count is stored under the common name `ground_truth`.
MN_validate_first <- MN_validate_first %>%
  mutate(round = "Round 1",
         ground_truth = first_choice_votes,
         first_choice_votes = NULL)

MN_validate_final <- MN_validate_final %>%
  mutate(round = "Round Final",
         ground_truth = final_round_votes,
         final_round_votes = NULL)

MN_validate_all <- bind_rows(MN_validate_first, MN_validate_final)


# Share of candidates (in percent) whose archive count equals the scraped
# count, per facet, together with the fixed position of the annotation.
match_rate <- MN_validate_all %>%
  group_by(round) %>%
  summarize(
    match_percent = 100 * mean(votes == ground_truth),
    label_x = 1000,
    label_y = 19000
  )

# Scatter plot of archive vs. scraped counts; points on the dashed 45-degree
# line are exact matches.
figure_5 <- ggplot(MN_validate_all, aes(x = votes, y = ground_truth)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_point(colour = "darkcyan", shape = 16, size = 2, alpha = 0.5) +
  geom_text(
    data = match_rate,
    mapping = aes(x = label_x, y = label_y,
                  label = paste0(round(match_percent), "% match")),
    inherit.aes = FALSE,
    hjust = 0,
    size = 4
  ) +
  xlim(0, 20000) +
  ylim(0, 20000) +
  labs(x = "Votes (our dataset)", y = "Votes (scraped data)", title = "") +
  facet_wrap(~ round) +
  theme_bw()

figure_5


ggsave("Figure_5.pdf", plot = figure_5, width = 6, height = 3.2)


